import pandas as pd
# Let pandas display up to 999 columns so wide frames are not truncated.
pd.options.display.max_columns = 999

# Directory holding the pre-split CSV files. It is defined once so that
# pointing the analysis at another location means editing a single line.
DATA_DIR = 'C:/Users/mosto/REPOSITORY/Prediction_of_Insurance_Premium/data'

# index_col=False tells pandas not to use the first column as the row index.
train = pd.read_csv(f'{DATA_DIR}/Train.csv', index_col=False)
test = pd.read_csv(f'{DATA_DIR}/Test.csv', index_col=False)
val = pd.read_csv(f'{DATA_DIR}/Validation.csv', index_col=False)

# Report (rows, columns) for each split, in train / test / validation order.
print(train.shape)
print(test.shape)
print(val.shape)

# Preview the first rows of the training split.
train.head()
import plotly.express as px
# Scatter of monthly premium against months since the last claim on the full
# training set, with an ordinary-least-squares trend line overlaid.
px.scatter(
    data_frame=train,
    x="Months Since Last Claim",
    y="Monthly Premium Auto",
    trendline="ols",
)
# Explore the relationship between months since the last claim and the insurance premium.
# A "Months Since Last Claim" value of 0 means the customer had a claim very recently;
# a value of 35 means the customer has not had a claim in 35 months.
# Scatter of customer lifetime value against months since the last claim on
# the full training set, with an ordinary-least-squares trend line overlaid.
px.scatter(
    data_frame=train,
    x="Months Since Last Claim",
    y="Customer Lifetime Value",
    trendline="ols",
)
# Scatter of monthly premium against customer lifetime value with an OLS
# trend line. Markers are colored by the same column that is plotted on the
# y axis, so the color scale follows the premium.
px.scatter(
    data_frame=train,
    x="Customer Lifetime Value",
    y="Monthly Premium Auto",
    color="Monthly Premium Auto",
    trendline="ols",
)
# Draw a random 100-row subset of the training data. The fixed random_state
# makes the same rows come back on every run.
# NOTE(review): presumably done to keep the plots below light and readable.
train_sample = train.sample(random_state=42, n=100)

# Shape of the sampled frame.
train_sample.shape
# 3-D scatter of the sampled rows: months since last claim (x), customer
# lifetime value (y) and monthly premium (z). Markers are colored by the
# x-axis column.
px.scatter_3d(
    data_frame=train_sample,
    x="Months Since Last Claim",
    y="Customer Lifetime Value",
    z="Monthly Premium Auto",
    color="Months Since Last Claim",
)
# List the column names available in the sampled frame.
train_sample.columns.tolist()

# Months since policy inception versus the monthly premium (sampled rows).
px.scatter(
    data_frame=train_sample,
    x="Months Since Policy Inception",
    y="Monthly Premium Auto",
)

# Months since policy inception versus customer lifetime value (sampled rows).
px.scatter(
    data_frame=train_sample,
    x="Months Since Policy Inception",
    y="Customer Lifetime Value",
)
import seaborn as sns
# High-level overview with seaborn: a grid of pairwise plots over the sampled
# rows, with no column restriction passed to pairplot.
sns.pairplot(data=train_sample)
# Pair plot of the sampled rows restricted to a hand-picked set of columns.
# NOTE(review): "Coverage", "Education" and "Vehicle Size" look like they may
# be categorical; confirm they are numeric in Train.csv before relying on
# their panels.
wide_pairplot_vars = [
    "Customer Lifetime Value",
    "Coverage",
    "Education",
    "Income",
    "Monthly Premium Auto",
    "Months Since Last Claim",
    "Months Since Policy Inception",
    "Number of Open Complaints",
    "Number of Policies",
    "Total Claim Amount",
    "Vehicle Size",
]
a = sns.pairplot(data=train_sample, vars=wide_pairplot_vars)
# Narrower pair plot of the sampled rows covering six columns: lifetime value,
# income, premium, the two "months since" columns and total claim amount.
focused_pairplot_vars = [
    "Customer Lifetime Value",
    "Income",
    "Monthly Premium Auto",
    "Months Since Last Claim",
    "Months Since Policy Inception",
    "Total Claim Amount",
]
b = sns.pairplot(data=train_sample, vars=focused_pairplot_vars)
# Total claim amount plotted against the monthly premium for the sampled rows.
px.scatter(
    data_frame=train_sample,
    x="Monthly Premium Auto",
    y="Total Claim Amount",
)